/*******************************************************************************
																				
	DESCRIPTION: 	This do file creates the datasets that are to be fed into R 
					to create the ensemble predictions. 
					
					It saves one dataset per pooled sample, each pooling observations
					across several years (from y0 to y1, taken pairwise from the
					globals pool_start and pool_end).
				
*******************************************************************************/

clear all

* Prefix used in the names of the files this do file saves:
global id_code "002"

* Pooled samples to create. The k-th word of pool_start is paired with the
* k-th word of pool_end to give the first and last year of the k-th pool:
global pool_start "2006 2009"
global pool_end   "2007 2010"

************************************************************************
* 1. Preparation to split the dataset
************************************************************************

* Load the main dataset and discard the occupation variables
use "$data/001_9_FinalMainDataset.dta", clear
drop L_Occupation*

* Put the observations in a reproducible random order (fixed seed), so that
* any partition later taken by row position is a random sample.
set seed 2111
gen double random = runiform()
isid random // Abort if two draws tie: the ordering must be unique
sort random
drop random

* Persist the random ordering as an index. Stored as long because the default
* float type cannot represent every integer above 16,777,216 exactly, which
* would create ties in n on very large datasets.
gen long n = _n

* Derive the calendar quarter and calendar year of startU:
gen q_startU = quarter(startU)
gen y_startU = year(startU)

* Expand each non-ordinal categorical variable into one indicator per level
* (tab names them <var>_dummy1, <var>_dummy2, ... in ascending order of the
* levels) and then drop the original variable.
foreach var in L_Industry_3digit L_civilStatus EducLevel L_Municipality ///
	L_Age_Youngest citizenship migrationCohort q_startU y_startU {

	* Remember the distinct years (ascending) before y_startU is dropped, so
	* the positional dummies can be relabelled with their calendar year below.
	if "`var'" == "y_startU" {
		levelsof y_startU, local(year_levels)
	}

	tab `var', gen(`var'_dummy)
	drop `var'
}

* Rename year dummies from positional index to calendar year, e.g.
* y_startU_dummy1 -> y_startU_dummy<first observed year>. Each dummy is matched
* to its actual level rather than to (min year + index - 1), so a gap in the
* observed years cannot shift the labels of later years.
local i = 0
foreach yr of local year_levels {
	local ++i
	rename y_startU_dummy`i' y_startU_dummy`yr'
}



************************************************************************
* 2. Sample preparation for each model
************************************************************************
pause off
local model Full
local sampleName Full

* One iteration per pooled sample: the s-th words of pool_start / pool_end give
* the first (y0) and last (y1) calendar year of the pool.
local samples: word count $pool_start
forval s = 1/`samples' {
	
	* Parse globals:
	local y0: word `s' of $pool_start
	local y1: word `s' of $pool_end

	* Covariates of the model. This must be defined here, after y0 and y1 are
	* set: local macros are expanded when the definition is executed, so
	* defining it before the loop would leave the year-dummy range without
	* its year suffixes.
	local varSet "$demoEdu $migHist $mun $incIndiv $incOther $incHist $indu $emplHist q_startU_dummy* y_startU_dummy`y0'-y_startU_dummy`y1'"

	* First and last day of the pooling window:
	local win_lo = d(01jan`y0')
	local win_hi = d(31dec`y1')
		
	preserve 

	************************************************************************
	* Keep only observations from the pooled years
	* (note, this isn't done for months 1, 2, 4, 5, etc.)
	************************************************************************
	* temp flags observations that keep at least one non-missing 3-month
	* outcome after the blanking below.
	gen temp = 0

	forvalues i = 0(3)12{

		* Blank the outcomes measured at offset i months when the date
		* startU + i*30 days falls outside the pooling window.
		replace emplAft3M_`i'M_In = . if !inrange(startU + `i'*30, `win_lo', `win_hi')
		replace emplAft6M_`i'M_In = . if !inrange(startU + `i'*30, `win_lo', `win_hi')
			
		replace temp = 1 if emplAft3M_`i'M_In!=.
	}	

	* The 1-month and 12-month outcomes only exist at offset 0:
	replace emplAft1M_0M_In = . if !inrange(startU, `win_lo', `win_hi')

	replace emplAft12M_0M_In = . if !inrange(startU, `win_lo', `win_hi')

	* Drop observations with no usable 3-month outcome in the window
	drop if temp==0
	drop temp


	************************************************************************
	* Keep only observations and variables that belong to the sample
	************************************************************************
	* this is indicated by variable inSample_`sampleName' generated in code 001_09
	keep if inSample_`sampleName' == 1
	pause

	* keep only variables that belong to the model
	keep LopNr_PersonNr InLnr `varSet' inSample_`sampleName' emplAft* n 
	pause


	************************************************************************
	* Prepare and save the sample
	************************************************************************

	* Generate variable used to split dataset into parameter tuning, training
	* and prediction partitions: the rank of the random index n within the
	* retained observations. Stored as long so that every row index is
	* represented exactly.
	sort n
	gen long n_order = _n	

	rename inSample_`sampleName' inSample

	* The last variables should be covariates starting from Gender 
	* This is used to tell R which variables should be included in the model
	order LopNr_PersonNr InLnr n n_order inSample emplAft* Gender, first

	compress

	save "${data}/${id_code}_DataForR_`model'_Pooled_`y0'_`y1'.dta", replace

	restore

}



